Samiksha Rajpal - 8908982¶

Lab Assignment - 3¶

In [8]:
# Core data-handling and plotting libraries, followed by the
# scikit-learn pieces used in the regression workflow below.
# (Fixed: `pandas` was imported twice in the original cell.)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
In [9]:
from sklearn import datasets

# Load the diabetes dataset and expose the raw feature matrix and
# target vector as NumPy arrays.
_diabetes_bunch = datasets.load_diabetes()
features, tar = _diabetes_bunch.data, _diabetes_bunch.target
In [10]:
# Build a labeled DataFrame: one column per feature plus the regression
# target. (Fixed: redundant re-imports of `datasets` and `pandas` —
# both are already imported at the top of the notebook.)
diabetes = datasets.load_diabetes()

diabetes_df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)

diabetes_df['target'] = diabetes.target
In [11]:
# Preview the first five rows to sanity-check column names and value scaling.
diabetes_df.head()
Out[11]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0
In [12]:
# Replace the opaque s1–s6 codes with descriptive clinical names.
serum_column_names = {
    's1': 'total serum cholesterol',
    's2': 'low-density lipoproteins',
    's3': 'high-density lipoproteins',
    's4': 'total cholesterol / HDL',
    's5': 'possibly log of serum triglycerides level',
    's6': 'blood sugar level',
}
diabetes_df = diabetes_df.rename(columns=serum_column_names)
In [13]:
# Confirm the renamed columns took effect.
diabetes_df.head()
Out[13]:
age sex bmi bp total serum cholesterol low-density lipoproteins high-density lipoproteins total cholesterol / HDL possibly log of serum triglycerides level blood sugar level target
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0
In [14]:
# Expect 442 samples and 11 columns (10 features + target).
diabetes_df.shape
Out[14]:
(442, 11)
In [15]:
# Dtypes and null counts: all columns are float64 with no missing values.
diabetes_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   age                                        442 non-null    float64
 1   sex                                        442 non-null    float64
 2   bmi                                        442 non-null    float64
 3   bp                                         442 non-null    float64
 4   total serum cholesterol                    442 non-null    float64
 5   low-density lipoproteins                   442 non-null    float64
 6   high-density lipoproteins                  442 non-null    float64
 7   total cholesterol / HDL                    442 non-null    float64
 8   possibly log of serum triglycerides level  442 non-null    float64
 9   blood sugar level                          442 non-null    float64
 10  target                                     442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB
In [16]:
# Inspect the spread of the target: distinct values and their frequencies.
target_series = diabetes_df['target']
print(target_series.unique())
print(target_series.value_counts())
[151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144. 168.  68.  49. 245. 184. 202. 137.  85. 131. 283.
 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.  61.  92.
 259.  53. 190. 142. 155. 225. 104. 182. 128.  52.  37. 170.  71. 163.
 150. 160. 178.  48. 270. 111.  42. 200. 113. 143.  51. 210. 134.  98.
 164.  96. 162. 279.  83. 302. 198.  95. 232.  81. 246. 297. 258. 229.
 275. 281. 173. 180.  84. 121. 161.  99. 109. 115. 268. 274. 158. 107.
 103. 272. 280. 336. 317. 235.  60. 174. 126. 288.  88. 292. 197. 186.
  25. 195. 217. 172. 214.  70. 220. 152.  47.  74. 295. 127. 237.  64.
  79.  91. 116.  86. 122.  72.  39. 196. 222. 277.  77. 191.  73. 263.
 248. 296.  78.  93. 208. 108. 154. 124.  67. 257. 262. 177. 187. 125.
 215. 303. 243. 153. 346.  89.  50. 308. 145.  45. 264. 241.  66.  94.
 230. 181. 156. 233. 219.  80. 332.  31. 236. 253.  44. 114. 147. 242.
 249. 192. 244. 199. 306. 216. 139. 148.  54. 221. 311. 321.  58. 123.
 167. 140.  40. 132. 201. 273.  43. 175. 293. 189. 209. 136. 261. 146.
 212. 120. 183.  57.]
target
200.0    6
72.0     6
90.0     5
178.0    5
71.0     5
        ..
73.0     1
222.0    1
86.0     1
79.0     1
57.0     1
Name: count, Length: 214, dtype: int64

Univariate Analysis¶

In [17]:
# Hexbin joint plot of blood pressure vs. BMI with marginal histograms.
sns.jointplot(x='bp',y='bmi',data=diabetes_df,kind='hex')
Out[17]:
<seaborn.axisgrid.JointGrid at 0x2067eb0fbd0>

The plot above shows the relationship between BMI and blood pressure (bp)¶

In [18]:
# Joint plot with a fitted regression line: blood sugar level vs. age.
sns.jointplot(x='blood sugar level',y='age',data=diabetes_df,kind='reg')
Out[18]:
<seaborn.axisgrid.JointGrid at 0x2067ea9edd0>

Pair Plot¶

In [19]:
# Pairwise scatter matrix of all columns, colored by the binary `sex` feature.
sns.pairplot(diabetes_df, hue='sex')
Out[19]:
<seaborn.axisgrid.PairGrid at 0x2067eb0b650>

Univariate Linear Regression: Building, Training and Predicting¶

In [20]:
# Rebuild the raw arrays and select a single predictor for univariate
# regression. (Fixed: the original called load_diabetes() twice — once
# with return_X_y and once more just to read feature_names.)
diabetes_bunch = datasets.load_diabetes(as_frame=False)
diabetes_x, diabetes_y = diabetes_bunch.data, diabetes_bunch.target
diabetes_df = pd.DataFrame(diabetes_x, columns=diabetes_bunch.feature_names)
diabetes_df['target'] = diabetes_y

# Keep only the BMI column (index 2), shaped (n_samples, 1) as
# scikit-learn estimators expect for a single feature.
diabetes_X = diabetes_x[:, np.newaxis, 2]
diabetes_y = diabetes_df["target"]

# Hold out 20% of the samples; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(diabetes_X, diabetes_y, test_size=0.2, random_state=0)
In [21]:
# Fit an ordinary least-squares model on BMI alone, then predict
# disease progression for the held-out samples.
diabetes_model = LinearRegression().fit(X_train, y_train)
y_pred = diabetes_model.predict(X_test)
In [22]:
# Scatter both splits and overlay the fitted regression line, using the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test, y_test, color='green', label='Test Data')
ax.scatter(X_train, y_train, color='blue', label='Trained Data')
ax.plot(X_test, y_pred, color='red', linewidth=4, label='Predicted')
ax.set_xlabel('Body Mass Index')
ax.set_ylabel('Disease Progression')
ax.set_title('Univariate : Linear Regression: Body Mass Index vs. Disease Progression')
ax.legend()
plt.show()

Details on the model¶

In [23]:
# Report the fitted model's parameters.
print("Coefficients : %.2f" % diabetes_model.coef_[0])
print("Intercept: %.2f" % diabetes_model.intercept_, "\n")

# Evaluate on both splits so over/under-fitting is visible.
y_pred_train = diabetes_model.predict(X_train)

train_MSE = metrics.mean_squared_error(y_train, y_pred_train)
test_MSE = metrics.mean_squared_error(y_test, y_pred)

train_MAE = metrics.mean_absolute_error(y_train, y_pred_train)
test_MAE = metrics.mean_absolute_error(y_test, y_pred)

# BUG FIX: the original printed MSE values under "Mean Absolute Error"
# headings and swapped the train/test labels. Each metric is now
# reported under its own name, matching the recorded output format.
print(f"Mean Absolute Error (Train) : {train_MAE:.2f}")
print(f"Mean Absolute Error (Test)  : {test_MAE:.2f}", "\n")

print(f"Mean Squared Error (Train) : {train_MSE:.2f}")
print(f"Mean Squared Error (Test)  : {test_MSE:.2f}")
Coefficients : 981.66
Intercept: 152.29 

Mean Absolute Error (Train) : 51.32
Mean Absolute Error (Test)  : 52.94 

Mean Squared Error (Train) : 3827.82
Mean Squared Error (Test)  : 4150.68

Observations :¶

We observe a high Mean Absolute Error (roughly 51–53 on both the training and test splits), indicating that the model is not performing optimally. A univariate linear regression is not well suited to this dataset because of the high dispersion of the target values around the fitted line.